{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "pyLightGBM\n",
    "=======\n",
    "\n",
    "Python wrapper for Microsoft [LightGBM](https://github.com/Microsoft/LightGBM)  \n",
    "Make sure that you have installed LightGBM [Installation-Guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide)\n",
    "\n",
    "**GitHub :** [https://github.com/ArdalanM/pyLightGBM](https://github.com/ArdalanM/pyLightGBM)\n",
    "\n",
    "**Author of this notebook :** Evgeny BAZAROV <baz.evgenii@gmail.com>\n",
    "\n",
    "-------\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import os, gc\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn import metrics, model_selection\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "from pylightgbm.models import GBMRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "DATA\n",
    "-------\n",
    "\n",
    "This example uses **data from the Kaggle competition Allstate Claims Severity**  \n",
    "You can download data here : https://www.kaggle.com/c/allstate-claims-severity/data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train data shape (188318, 132)\n",
      "Test data shape (125546, 131)\n"
     ]
    }
   ],
   "source": [
    "# Load the zipped competition CSVs first, then report both shapes.\n",
    "df_train = pd.read_csv(\"data/train.csv.zip\")\n",
    "df_test = pd.read_csv(\"data/test.csv.zip\")\n",
    "\n",
    "print('Train data shape', df_train.shape)\n",
    "print('Test data shape', df_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Extracting `loss` from train and `id` from test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The model is trained on log(loss + 1); the CV metric maps predictions back with exp(.) - 1.\n",
    "# `.values` and `np.float64` are used instead of `.as_matrix()` and `np.float`,\n",
    "# which were removed from recent pandas / NumPy releases.\n",
    "y = np.log(df_train['loss'] + 1).values.astype(np.float64)\n",
    "# Keep the test ids in their own array: the `id` column is dropped from the merged frame later.\n",
    "id_test = np.array(df_test['id'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Merging train and test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merged data shape (313864, 132)\n"
     ]
    }
   ],
   "source": [
    "# Stack train on top of test so that categorical columns are encoded consistently.\n",
    "# `pd.concat` is used because `DataFrame.append` was removed in pandas 2.0.\n",
    "df = pd.concat([df_train, df_test], ignore_index=True)\n",
    "\n",
    "# The separate frames are no longer needed; release them before the heavy steps.\n",
    "del df_test, df_train\n",
    "gc.collect()\n",
    "\n",
    "print('Merged data shape', df.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dropping columns that are not useful"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# `loss` is the target and `id` is only a row identifier, so neither is kept as a feature.\n",
    "df = df.drop(['loss', 'id'], axis=1)\n",
    "feature_list = list(df.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transform categorical features `cat1` to `cat116` into integer labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "le = LabelEncoder()\n",
    "\n",
    "# Re-fit the same encoder on each column whose name contains 'cat' and\n",
    "# overwrite that column with its integer codes; other columns are left as they are.\n",
    "for col in list(df.columns):\n",
    "    if 'cat' not in col:\n",
    "        continue\n",
    "    df[col] = le.fit_transform(df[col])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TRAIN, VALIDATION, TEST\n",
    "-------\n",
    "Split data into train, validation (for early stopping) and test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train-test split\n",
      "train transform\n",
      "\n",
      "Train shape (188318, 130)\n"
     ]
    }
   ],
   "source": [
    "print('train-test split')\n",
    "# Train rows were concatenated first, so the first len(y) rows are the training set.\n",
    "df_train, df_test = df.iloc[:len(y)], df.iloc[len(y):]\n",
    "# The test rows are not used by the cross-validated search below, so drop them right away.\n",
    "del df, df_test\n",
    "gc.collect()\n",
    "\n",
    "print('train transform\\n')\n",
    "# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.\n",
    "X = df_train.values\n",
    "\n",
    "del df_train\n",
    "gc.collect()\n",
    "\n",
    "print('Train shape', X.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Bayesian Optimization of GBMRegressor params\n",
    "-------\n",
    "For more information about Bayesian Optimization please visit this github page :  \n",
    "https://github.com/fmfn/BayesianOptimization\n",
    "\n",
    "All credit goes to the author, Fernando M. F. Nogueira"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.metrics import make_scorer, mean_absolute_error\n",
    "from bayes_opt import BayesianOptimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def mae(y, y_pred):\n",
    "    \"\"\"Mean absolute error measured on the original loss scale.\n",
    "\n",
    "    Both arguments are on the log(loss + 1) scale used for the training\n",
    "    target, so each one is mapped back with exp(.) - 1 before scoring.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    y : array-like\n",
    "        True targets on the log scale.\n",
    "    y_pred : array-like\n",
    "        Predicted targets on the log scale.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        MAE between the back-transformed values.\n",
    "    \"\"\"\n",
    "    # scikit-learn metrics take (y_true, y_pred); pass them in that order.\n",
    "    return mean_absolute_error(np.exp(y) - 1, np.exp(y_pred) - 1)\n",
    "\n",
    "\n",
    "def gbmr_eval(num_leaves, min_data_in_leaf, feature_fraction, bagging_fraction, seed=42):\n",
    "    \"\"\"Objective for the Bayesian search: mean 5-fold CV score of a GBMRegressor.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    num_leaves, min_data_in_leaf : float\n",
    "        Proposed by the optimizer as floats and truncated to int here.\n",
    "    feature_fraction, bagging_fraction : float\n",
    "        Sampling fractions; clamped to the [0, 1] interval before use.\n",
    "    seed : int, optional\n",
    "        Seed used for both feature and bagging sampling.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Mean of the per-fold scores. The scorer is created with\n",
    "        greater_is_better=False, so this is the negated MAE and a larger\n",
    "        (less negative) value is better.\n",
    "    \"\"\"\n",
    "    gbmr = GBMRegressor(\n",
    "        exec_path='/path/to/your/LightGBM/lightgbm', # Change this to your LightGBM path\n",
    "        config='',\n",
    "        application='regression',\n",
    "        num_iterations=500,\n",
    "        learning_rate=0.1,\n",
    "        num_leaves=int(num_leaves),\n",
    "        tree_learner='serial',\n",
    "        num_threads=4,\n",
    "        min_data_in_leaf=int(min_data_in_leaf),\n",
    "        metric='l2',\n",
    "        # Keep the fractions inside [0, 1] even if a proposed value drifts past a bound.\n",
    "        feature_fraction=min(max(feature_fraction, 0), 1),\n",
    "        feature_fraction_seed=seed,\n",
    "        bagging_fraction=min(max(bagging_fraction, 0), 1),\n",
    "        bagging_freq=10,\n",
    "        bagging_seed=seed,\n",
    "        metric_freq=1,\n",
    "        verbose=False\n",
    "    )\n",
    "\n",
    "    # X and y are the notebook-level training matrix and log-scale target.\n",
    "    neg_mae_scorer = make_scorer(score_func=mae, greater_is_better=False)\n",
    "    fold_scores = cross_val_score(gbmr, X=X, y=y, scoring=neg_mae_scorer, cv=5, verbose=0, pre_dispatch=1)\n",
    "    return np.array(fold_scores).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[31mInitialization\u001b[0m\n",
      "\u001b[94m-----------------------------------------------------------------------------------------------------------\u001b[0m\n",
      " Step |   Time |      Value |   bagging_fraction |   feature_fraction |   min_data_in_leaf |   num_leaves | \n",
      "    1 | 04m26s | \u001b[35m-1164.38437\u001b[0m | \u001b[32m            0.6703\u001b[0m | \u001b[32m            0.4968\u001b[0m | \u001b[32m           40.0666\u001b[0m | \u001b[32m    177.5792\u001b[0m | \n",
      "    2 | 07m09s | -1250.96450 |             0.3270 |             0.8248 |            21.8809 |     473.7783 | \n",
      "    3 | 03m39s | \u001b[35m-1144.07996\u001b[0m | \u001b[32m            0.8711\u001b[0m | \u001b[32m            0.5284\u001b[0m | \u001b[32m          127.2338\u001b[0m | \u001b[32m     22.1844\u001b[0m | \n",
      "    4 | 09m08s | -1179.26832 |             0.9244 |             0.9143 |           178.1341 |     374.0264 | \n",
      "    5 | 05m18s | -1212.30360 |             0.3862 |             0.5212 |            91.0555 |     475.1453 | \n",
      "    6 | 03m19s | -1145.65928 |             0.5126 |             0.3603 |           163.1402 |      29.9441 | \n",
      "    7 | 03m47s | -1153.17069 |             0.6146 |             0.5543 |            21.9241 |      65.2718 | \n",
      "    8 | 04m42s | -1189.71398 |             0.4649 |             0.6953 |            34.3985 |     234.7518 | \n",
      "    9 | 07m17s | -1188.86696 |             0.6355 |             0.7558 |           177.6290 |     402.9700 | \n",
      "   10 | 05m13s | -1155.38619 |             0.7992 |             0.7005 |           120.9871 |     136.0537 | \n",
      "   11 | 05m28s | -1184.31345 |             0.6560 |             0.4881 |            90.3574 |     354.9669 | \n",
      "   12 | 05m46s | -1189.32794 |             0.6442 |             0.6181 |            67.5643 |     376.6478 | \n",
      "   13 | 05m09s | -1191.86184 |             0.5718 |             0.4432 |            57.8956 |     399.1489 | \n",
      "   14 | 03m43s | -1147.74732 |             0.7222 |             0.5292 |            52.5914 |      58.2162 | \n",
      "   15 | 06m44s | -1173.78458 |             0.9366 |             0.6252 |            90.7892 |     368.6666 | \n",
      "\u001b[31mBayesian Optimization\u001b[0m\n",
      "\u001b[94m-----------------------------------------------------------------------------------------------------------\u001b[0m\n",
      " Step |   Time |      Value |   bagging_fraction |   feature_fraction |   min_data_in_leaf |   num_leaves | \n",
      "   16 | 03m12s | -1150.76459 |             1.0000 |             1.0000 |            15.0000 |      15.0000 | \n",
      "   17 | 03m22s | -1147.60424 |             1.0000 |             1.0000 |           200.0000 |      15.0000 | \n",
      "   18 | 04m10s | -1151.81655 |             0.5382 |             0.6131 |            87.1673 |      64.2613 | \n",
      "   19 | 02m33s | -1148.88296 |             1.0000 |             0.3000 |           140.8292 |      15.0000 | \n",
      "   20 | 03m47s | -1145.02413 |             1.0000 |             0.3000 |           200.0000 |     121.9517 | \n",
      "   21 | 03m46s | -1154.90483 |             0.3000 |             1.0000 |           120.0461 |      30.2717 | \n",
      "   22 | 05m54s | -1172.60793 |             1.0000 |             0.3000 |           200.0000 |     500.0000 | \n",
      "   23 | 03m13s | \u001b[35m-1141.37541\u001b[0m | \u001b[32m            1.0000\u001b[0m | \u001b[32m            0.3000\u001b[0m | \u001b[32m          200.0000\u001b[0m | \u001b[32m     65.3656\u001b[0m | \n",
      "   24 | 03m30s | -1162.71128 |             0.3623 |             0.5084 |            94.4783 |      87.5153 | \n",
      "   25 | 04m18s | -1151.84003 |             1.0000 |             0.3000 |           200.0000 |     201.6192 | \n",
      "   26 | 04m40s | -1165.98833 |             0.7695 |             0.3767 |           102.7805 |     272.3306 | \n",
      "   27 | 04m22s | -1145.77916 |             0.7499 |             0.5507 |           199.9165 |      59.9345 | \n",
      "   28 | 05m02s | -1172.29159 |             0.8436 |             0.4718 |            16.1583 |     386.3908 | \n",
      "   29 | 08m20s | -1173.79333 |             0.9452 |             0.9395 |            81.4048 |     330.9234 | \n",
      "   30 | 06m07s | -1177.14923 |             0.4428 |             0.9729 |            72.8035 |     141.8127 | \n",
      "   31 | 03m14s | -1141.62019 |             1.0000 |             0.3000 |           200.0000 |      84.6941 | \n",
      "   32 | 03m31s | -1146.87739 |             0.7013 |             0.3674 |           188.2707 |      79.7561 | \n",
      "   33 | 04m51s | -1194.72457 |             0.3994 |             0.5665 |           138.7609 |     348.3559 | \n",
      "   34 | 05m29s | -1217.12870 |             0.4458 |             0.5201 |            44.9735 |     490.5713 | \n",
      "   35 | 05m29s | -1180.35449 |             0.7137 |             0.5423 |            66.4621 |     356.6290 | \n",
      "   36 | 05m53s | -1177.94025 |             0.7202 |             0.7504 |            18.7171 |     323.4255 | \n",
      "   37 | 04m30s | -1158.58878 |             0.9680 |             0.3064 |            55.2655 |     284.4059 | \n",
      "   38 | 05m02s | -1178.51415 |             0.7343 |             0.5320 |            30.2899 |     358.9008 | \n",
      "   39 | 06m48s | -1210.61409 |             0.4931 |             0.9380 |            50.6096 |     391.7246 | \n",
      "   40 | 04m00s | -1156.24428 |             0.8081 |             0.5318 |            16.4243 |     141.0103 | \n",
      "   41 | 04m46s | -1188.41352 |             0.4894 |             0.4963 |            74.3967 |     287.1007 | \n",
      "   42 | 04m37s | -1158.04619 |             1.0000 |             0.3000 |           200.0000 |     281.0982 | \n",
      "   43 | 05m23s | -1176.91961 |             0.8328 |             0.3687 |            63.8295 |     426.1541 | \n",
      "   44 | 07m17s | -1185.41657 |             0.6901 |             0.8123 |           100.3116 |     343.1389 | \n",
      "   45 | 03m17s | -1148.73994 |             0.4748 |             0.3399 |           124.7893 |      17.7117 | \n",
      "   46 | 02m59s | -1143.35300 |             1.0000 |             0.3000 |           142.6143 |      33.9517 | \n",
      "   47 | 06m01s | -1166.48326 |             0.8903 |             0.7787 |            32.9032 |     260.1291 | \n",
      "   48 | 05m48s | -1210.64126 |             0.4272 |             0.7127 |            67.2857 |     370.8645 | \n",
      "   49 | 04m56s | -1166.94247 |             1.0000 |             0.3000 |            92.3024 |     402.3956 | \n",
      "   50 | 05m28s | -1142.56224 |             0.9692 |             0.8063 |           184.3579 |      49.1779 | \n",
      "   51 | 03m07s | \u001b[35m-1141.06710\u001b[0m | \u001b[32m            1.0000\u001b[0m | \u001b[32m            0.3000\u001b[0m | \u001b[32m          169.3443\u001b[0m | \u001b[32m     52.2018\u001b[0m | \n",
      "   52 | 04m04s | -1155.05742 |             0.3911 |             0.7517 |           172.0263 |      52.4654 | \n",
      "   53 | 03m08s | -1141.92134 |             1.0000 |             0.3000 |           154.1187 |      54.7625 | \n",
      "   54 | 05m53s | -1181.79224 |             0.7849 |             0.4428 |            47.3041 |     460.5153 | \n",
      "   55 | 03m11s | -1142.09297 |             1.0000 |             0.3000 |           162.6768 |      47.2758 | \n",
      "Final Results\n",
      "XGBOOST: -1141.067104\n"
     ]
    }
   ],
   "source": [
    "num_iter = 40\n",
    "init_points = 15\n",
    "\n",
    "gbmrBO = BayesianOptimization(gbmr_eval, \n",
    "                              {\n",
    "                                'num_leaves': (15, 500),\n",
    "                                'min_data_in_leaf': (15, 200),\n",
    "                                'feature_fraction': (0.3,1),\n",
    "                                'bagging_fraction': (0.3,1),\n",
    "                              }\n",
    "                             )\n",
    "\n",
    "gbmrBO.maximize(init_points=init_points, n_iter=num_iter)\n",
    "\n",
    "print('Final Results')\n",
    "print('XGBOOST: %f' % gbmrBO.res['max']['max_val'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Once the search has finished, we can look at the best params found by Bayesian Optimization:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'bagging_fraction': 1.0,\n",
       " 'feature_fraction': 0.29999999999999999,\n",
       " 'min_data_in_leaf': 169.3442542624897,\n",
       " 'num_leaves': 52.201833585663096}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parameter set with the highest objective value seen during the search.\n",
    "# num_leaves and min_data_in_leaf are reported as floats; gbmr_eval casts them with int().\n",
    "gbmrBO.res['max']['max_params']"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
